/***************************************************************************
 *
 * Copyright (C) 2001 International Business Machines
 * All rights reserved.
 *
 * This file is part of the GPFS mmfslinux kernel module.
 *
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions 
 * are met:
 *
 *  1. Redistributions of source code must retain the above copyright notice, 
 *     this list of conditions and the following disclaimer. 
 *  2. Redistributions in binary form must reproduce the above copyright 
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution. 
 *  3. The name of the author may not be used to endorse or promote products 
 *     derived from this software without specific prior written
 *     permission. 
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 *************************************************************************** */
/*
 * $Id: mmap.c,v 1.24.2.9 2002/08/07 20:02:38 mcnabb Exp $
 *
 * $Log: mmap.c,v $
 * Revision 1.24.2.9  2002/08/07 20:02:38  mcnabb
 * Node crashed in flush_vma while getting page table entry from bad pmd
 * offset. This is an old bug. But, for some reason we did not catch it
 * before. Make sure that the pmd is valid before getting page table entry.
 *
 * Revision 1.24.2.8  2002/07/31 21:00:26  gjertsen
 * Only include Linux mmap patch if MMAP_LINUX_PATCH is toggled on in
 * LINUX_PATCH_DEFINES.
 *
 * Revision 1.24.2.7  2002/07/31 16:22:34  yuri
 * Fix build break (variable rename).
 *
 * Revision 1.24.2.6  2002/07/31 15:05:18  gjertsen
 * Revise mmap patch to handle 2.4.18-3 or later versions.
 *
 * Revision 1.24.2.5  2002/07/31 04:51:58  mcnabb
 * Linux 2.4.18-5: Before clearing pte entry, call page_remove_rmap() to
 * remove pte entry from page pte chain.
 *
 * Revision 1.24.2.4  2002/07/12 22:27:34  mcnabb
 * After calling filemap_fdatasync(), need to wait until all dirty mmaped
 * pages are flushed disk.
 *
 * Revision 1.24.2.3  2002/05/21 21:44:58  dcraft
 * Pull GPFS 1.2.1 up to kernel 2.4.18.
 * mmfsfuncs.Linux must be distributed with /usr/lpp/mmfs/src
 * on developerworks.
 *
 * Revision 1.24.2.2  2002/01/25 04:15:10  mcnabb
 * Defect 357492:
 * Flush dirty pages before terminating mmap.
 *
 * Revision 1.24.2.1  2001/11/21 07:52:53  mcnabb
 * Defect 353917:
 * vm_ops->close() always be done after calling vm_ops->open() even if
 * returns error. Because of this Linux semantics we need to increment
 * mmap counters even if an error occurs.
 *
 * Revision 1.24  2001/10/03 14:46:14  dcraft
 * First attempt to bring us up to 2.4.9 and 2.4.10
 *
 * Revision 1.23  2001/09/25 19:02:38  gjertsen
 * Some IA64 code cleanup. Suppress unwanted compiler warnings.
 *
 * Revision 1.22  2001/09/25 14:27:51  dcraft
 * Comment change.  No code change.
 *
 * Revision 1.21  2001/09/24 13:15:14  radhak
 * Got rid of dead code.
 *
 * Revision 1.20  2001/08/21 01:56:27  tee
 * Make AIX and Linux versions of buf queue handling be the same.
 *
 * Revision 1.19  2001/08/20 15:02:16  gjertsen
 * Avoid calling flush_tlb_page for IA64 for now (not exported by kernel).
 *
 * Revision 1.18  2001/08/18 02:11:30  tee
 * Improve matching between AIX and Linux mmap code to prepare for source file
 * merge.
 *
 * Revision 1.17  2001/08/16 01:26:24  tee
 * Fix dynamic mmap kproc starting and stopping on Linux.  Don't start processes
 * that only live for the duration of a page fault.  Use the daemon startup
 * thread as a master controller for the kprocs, which will wait on them when
 * they exit to keep them from becoming zombies.
 *
 * Revision 1.16  2001/08/14 02:53:42  tee
 * Don't dereference NULL vinfo pointer on write requests.  Return error to
 * page fault hander if read fails rather than allowing application to use
 * whatever happens to be in the buffer.  Don't clear PageUptodate flag on
 * write requests.  Don't use PageError flag.
 *
 * Revision 1.15  2001/08/13 20:40:51  schmuck
 * Partial fix for Defect 345614 (Raleigh Defect 4423):
 *   The sendfile() system call generates paging requests without an explicit
 * mmap().  When the file is closed, paging requests for read-ahead that were
 * generated as a side effect of sendfile() may still be queued and were not
 * being cleaned up.  A pager kproc would then process these requests using a
 * pointer to the old vinfo structure, which could have been reused for a
 * different file.  This caused inconsistencies in our prefetch buffer
 * accounting, eventually leading to the observed asserts.
 *   Add a flag to the vinfo structure to remember whether we received paging
 * requests for this instance.  On close, check the flag and call mmFlushseg
 * to cancel pending requests if necessary.
 *   Still missing: close needs to wait if a paging request from the instance
 * is currently being processed by a pager kproc.
 *
 * Revision 1.14  2001/08/10 22:33:03  tee
 * Declare functions with correct argument types to avoid unnecessary casting.
 * Cosmetic changes to make AIX and Linux mmap code more similar.
 *
 * Revision 1.13  2001/08/10 22:31:09  tee
 * Don't trace mmap buffer as string since it could contain unprintable
 * characters and might not be terminated.
 *
 * Revision 1.12  2001/08/10 18:22:32  gjertsen
 * Put errno predeclaration for kernel code in Shark-gpl.h.
 *
 * Revision 1.11  2001/08/09 21:11:22  dcraft
 * Modifications to allow running on latest Redhat 7.1 update
 * Kernel version 2.4.3-12.
 * Requires checkout of new site.mcr.proto
 *
 * Revision 1.10  2001/08/04 00:42:27  tee
 * Remove LINUX_MMAP ifdefs
 *
 * Revision 1.9  2001/08/01 20:20:29  tee
 * Increase trace level in flush_vma
 *
 * Revision 1.8  2001/07/21 00:30:55  tee
 * Invalidate memory mapped pages when a byte-range write lock is acquired,
 * unless it is being acquired to service a page fault.  This fixes the
 * problem of stale data being seen sometimes by an mmap application even
 * after a sync.  Also, add new option to kxMmapFlush to discard data without
 * flushing.  Use this in trunc to invalidate mapped pages, since flushing is
 * not necessary for pages in the truncated region.
 *
 * Revision 1.7  2001/07/19 17:39:06  manoj
 * Allow readpage to service sendfile(). Not complete (need to invalidate Linux's
 * cached inode pages, either on return from readpage or on a wx/sx BRL request).
 *
 * Revision 1.6  2001/05/22 13:59:00  radhak
 * Initialize start address before invalidating mmaped pages.
 *
 * Revision 1.5  2001/05/18 19:24:00  radhak
 * Defect 339937:
 * Enabled single node mmap semantics and fixed deadlock found by Tom
 *
 * Revision 1.4  2001/05/08 13:40:57  dixonbp
 * Release the file table inherrited from mmfsd at the top of
 * kernel threads (in pagerKprocMainThreadP).
 *
 * Revision 1.3  2001/05/04 13:41:27  radhak
 * Defect 338936: cont.
 * Moved __WCLONE flag mmap.c
 *
 * Revision 1.2  2001/05/03 20:50:42  radhak
 * Defect 338936:
 * Linux mmap: PagerKproc::adjustKprocs:
 * Need to get rid of kernel thread zombies
 *
 * Revision 1.1  2001/04/02 14:33:16  dixonbp
 * Convert mmap.C to mmap.c
 *
 * Revision 1.18  2001/03/01 20:44:23  radhak
 * Need serialization between nopage and mmap flush.
 * Also, always get page table lock while holding page lock.
 *
 * Revision 1.15  2001/01/09 19:14:45  radhak
 * LINUX_MMAP: enable waitForFreeEntry()
 *
 * Revision 1.14  2000/12/29 22:22:35  radhak
 * Defect 322452: Before calling gpfs_filemap_sync get lock.
 * Also added some traces.
 *
 * Revision 1.13  2000/12/19 16:10:28  wyllie
 * Move Linux mmap declarations out of platform-independent code
 *
 * Revision 1.12  2000/12/15 13:56:46  gjertsen
 * Clean up documentation.
 *
 * Revision 1.11  2000/12/12 17:46:15  wyllie
 * Run cindent, add spaces after commas.  No functional changes.
 *
 * Revision 1.10  2000/12/03 01:48:10  radhak
 * LINUX_MMAP: mmap flush
 *
 * Revision 1.9  2000/12/01 02:10:59  schmuck
 * Instead of assigning NULL function pointers when initializing or resetting the
 * gpfs_operations table, have it point to a dummy function that returns ENOSYS.
 * This avoids having to check for NULL before each call.
 *
 * Revision 1.8  2000/11/17 16:13:18  radhak
 * Fix some compile error and remove the POLL_FREE macro
 *
 * Revision 1.7  2000/11/17 14:28:05  radhak
 * Removed relMmapCredP.
 *
 * Revision 1.6  2000/11/10 22:45:51  radhak
 * Removed all mmap printks and added one more function to symbol table.
 *
 * Revision 1.4  2000/11/08 01:10:21  radhak
 * More linux mmap code
 *
 * Revision 1.3  2000/11/06 19:56:15  gjertsen
 * Linux code cleanup and put in build safeguards.
 *
 * Revision 1.2  2000/11/02 19:46:28  gjertsen
 * Linux code split. Pull out NBD stuff.
 *
 */

#include <Shark-gpl.h>
#include <arch-gpl.h>

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/module.h>
#include <asm/pgalloc.h>
#include <linux/mman.h>
#include <linux/file.h>
#include <linux/sched.h>
#include <linux/smp_lock.h>

#include <cxiSystem.h>
#include <cxi2gpfs.h>
#include <cxiMmap.h>
#include <linux2gpfs.h>
#include <cxi2gpfs.h>
#include <Trace.h>

/* Value passed as the first argument of disable_lock() whenever pagerLock
   is taken in this file.  NOTE(review): presumably an interrupt priority
   level inherited from the AIX version -- confirm against the definition
   of disable_lock(). */
#define INTPAGER 10

/* Head of list of bufs to be processed.  Bufs are placed on this list
   by the strategy routine (which runs at interrupt level) and removed
   by pager kprocs.  This list is circular and doubly-linked using
   av_forw and av_back. */

/* Page_queue is the fixed pool of request entries.  NextFreePageEntry is
   the list head of the entries that are currently free, and PageRunQueue
   is the list head of the entries waiting to be processed (see
   RegisterPagerBackEnd() and gpfsRWpage()). */
cxibuf_t Page_queue[MAX_PAGEQUE_ENTRIES];
cxibuf_t NextFreePageEntry;
cxibuf_t PageRunQueue;

/* A lock to control access to the list of bufs to be processed.  Since the
   list is modified from interrupt level, disable_lock() must be used. */
Simple_lock pagerLock;

/* True if the pager kprocs have been started */
Boolean kprocsRunning = false;

/* If this flag is true, then all pagein or pageout requests will be returned
   immediately with an error. */
Boolean mmapAbort = false;

/* Event word which pager kprocs will wait on */
cxiWaitEvent_t pagerEventWord;

/* Event word to wait for NextFreePageEntry */
cxiWaitEvent_t FreeEventWord;
/* Set to true by waitForFreeEntry() for the duration of its sleep on
   FreeEventWord. */
Boolean waitingForFreeEntries = false;

/* LockWord and EventWord to serialize nopage() and mmapFlushseg() */
cxiBlockingMutex_t mmapFlushLockWord;
cxiWaitEvent_t mmapFlushEventWord;

#if LINUX_KERNEL_VERSION < 2041800
/* Write out every page on the dirty list of the inode's address space.
 * Only compiled for kernels older than 2.4.18; newer kernels use
 * filemap_fdatasync()/filemap_fdatawait() instead (see MMFlushRange).
 *
 * Each page is moved from the dirty list to the locked list, then (with
 * the list lock dropped) locked and handed to the mapping's writepage
 * operation.  The writepage operation is expected to unlock the page when
 * the I/O completes; this function waits for that with wait_on_page().
 *
 * Pages that are already locked by somebody else are skipped.
 *
 * inode - inode whose dirty page-cache pages are to be written.
 *
 * The result of writepage is stored in a local but not reported, because
 * the function returns void.
 */
void 
flush_dirty_pages(struct inode * inode)
{
  struct list_head *tmp;
  struct page *page;
  int error;
  spinlock_t *page_hash_lock;
  struct address_space *mapping = inode->i_mapping;

try_again:
  TRACE0(TRACE_VNODE, 2, TRCID_MMAP_FLUSH_DIRTY_PAGES_1,
         "flush_dirty_pages: getting address mapping lock\n");
  spin_lock(&mapping->page_lock);

  while ( !list_empty(&mapping->dirty_pages) )
  {
    tmp = mapping->dirty_pages.next;
    page = list_entry(tmp, struct page, list);

    TRACE4(TRACE_VNODE, 2, TRCID_MMAP_FLUSH_DIRTY_PAGES_2,
           "flush_dirty_pages: page 0x%0X count %d index %d flags 0x%08X\n",
           page, page_count(page), page->index, page->flags);

    /* The hash lock is only tried, never waited for, while page_lock is
       held; on failure back off completely and start over. */
    page_hash_lock = PAGECACHE_LOCK(page);
    if (!spin_trylock(page_hash_lock)) 
    {
      spin_unlock(&mapping->page_lock);
      goto try_again;
    }

    /* Move this page from dirty list to locked list */
    list_del(&page->list);
    list_add(&page->list, &mapping->locked_pages);
    if (!PageDirty(page)) 
    {
      spin_unlock(page_hash_lock);
      continue;
    }

    /* Pin the page so it cannot go away once the spinlocks are dropped */
    page_cache_get(page);
    spin_unlock(page_hash_lock);
    spin_unlock(&mapping->page_lock);

    if (PageLocked(page))
    {
      /* Somebody else owns the page; skip it.  Drop the reference taken
         above, otherwise the page count is leaked on every skip. */
      page_cache_release(page);
      spin_lock(&mapping->page_lock);
      continue;
    }

    lock_page(page);
    if (PageDirty(page)) 
    {
      ClearPageDirty(page);
      if (page->mapping->a_ops->writepage)
      {
        /* NOTE(review): the #else branch below references `vma`, which is
           not declared in this function; it cannot compile and is only
           reachable for pre-2.4.0 kernels. */
#if LINUX_KERNEL_VERSION >= 2040000
        error = page->mapping->a_ops->writepage(page);
        wait_on_page(page);
#else
        error = page->mapping->a_ops->writepage(vma->vm_file,page);
#endif
      }
      else
      {
        /* No writepage operation: nobody will unlock the page for us, so
           release the page lock here instead of leaving it held forever. */
        UnlockPage(page);
      }
    } 
    else
      UnlockPage(page);

    page_cache_release(page);
    spin_lock(&mapping->page_lock);
  }
  TRACE0(TRACE_VNODE, 2, TRCID_MMAP_FLUSH_DIRTY_PAGES_6,
         "flush_dirty_pages:exit\n");
  spin_unlock(&mapping->page_lock);
}
#endif /* < 2041800 */

/* Take the per-node mmap flush flag.
 *
 * Sleeps on mmapFlushEventWord (under mmapFlushLockWord) for as long as
 * cnP->mmapFlush is already set, then sets it and returns.  The flag is
 * cleared again by mmapFlushUnLock().
 *
 * cnP - node whose mmapFlush flag is to be claimed
 * buf - label that appears in the trace records only
 */
void mmapFlushLock(cxiNode_t *cnP,char *buf)
{
  cxiBlockingMutexAcquire(&mmapFlushLockWord);

  for (;;)
  {
    if (!cnP->mmapFlush)
      break;

    TRACE1(TRACE_VNODE, 2, TRCID_MMAP_FLUSH_LOCK_A,
           "waiting for %s to finish\n",buf);
    cxiWaitEventWait(&mmapFlushEventWord, &mmapFlushLockWord, 0);
  }

  /* Flag is free: claim it before dropping the mutex */
  cnP->mmapFlush = true;
  cxiBlockingMutexRelease(&mmapFlushLockWord);

  TRACE1(TRACE_VNODE, 2, TRCID_MMAP_FLUSH_LOCK_B,
           "%s got mmapFlushLock\n",buf);
}

/* Give up the per-node mmap flush flag taken by mmapFlushLock().
 *
 * Clears cnP->mmapFlush and signals mmapFlushEventWord, both while holding
 * mmapFlushLockWord.
 *
 * cnP - node whose mmapFlush flag is to be cleared
 * buf - label that appears in the trace record only
 */
void mmapFlushUnLock(cxiNode_t *cnP,char *buf)
{
  cxiBlockingMutexAcquire(&mmapFlushLockWord);
  {
    cnP->mmapFlush = false;
    cxiWaitEventSignal(&mmapFlushEventWord);
  }
  cxiBlockingMutexRelease(&mmapFlushLockWord);

  TRACE1(TRACE_VNODE, 2, TRCID_MMAP_FLUSH_UNLOCK_A,
         "%s released mmapFlushLock\n",buf);
}
     
/* Invalidate and/or write back one mapped page.
 *
 * Called by flush_vma() with pageP locked and with page_table_lock NOT
 * held.  On return page_table_lock is not held.  The page lock is released
 * here, except on the write-back path, where the page is handed to the
 * mapping's writepage operation, which is expected to unlock it when the
 * I/O completes.
 *
 * ptep    - page table entry found by the caller for `address`
 * vma     - mapping the address belongs to
 * pageP   - the (locked) page the caller found behind ptep
 * address - user virtual address mapped by ptep
 * flags   - flush option; MmfInvalidate, MmfDiscard and MmfCacheInval
 *           clear the pte, other values leave it in place
 *
 * Returns the result of writepage if the page was written, otherwise 0.
 */
static inline int invalidate_mmap_page(pte_t * ptep, 
                                       struct vm_area_struct *vma,
                                       struct page *pageP, 
                                       unsigned long address,
                                       enum MmflushOption flags)
{
  pte_t pte;
  struct page *page;
  int write_access = 0;
  int error = 0; 

  TRACE4(TRACE_VNODE, 2, TRCID_MMAP_INV_ENTER,
         "invalidate_mmap_page: vma %08X address %08X flag %d pageP %08X\n",
         vma, address, flags,pageP);

  spin_lock(&vma->vm_mm->page_table_lock);

  /* Read the pte only after taking page_table_lock.  The caller dropped
     the lock (and may have slept in lock_page) since it last looked at
     this entry, so a copy taken before the lock could be stale. */
  pte = *ptep;

  if (!pte_present(pte))
  {
    TRACE0(TRACE_VNODE, 2, TRCID_MMAP_INV_PRESENT,
           "invalidate_mmap_page: pte not present\n");
    goto unlockpage;
  }
  if (pte_none(pte))
  {
    TRACE0(TRACE_VNODE, 2, TRCID_MMAP_INV_PTE_NONE,
           "invalidate_mmap_page: pte none\n");
    goto unlockpage;
  }
  
  if (vma->vm_flags & VM_WRITE)
     write_access++;
  page = pte_page(pte);

  /* Only act if the pte still maps the page the caller locked */
  if (page == pageP)
  {
    TRACE3(TRACE_VNODE, 2, TRCID_MMAP_INVALIDATE_PAGE,
           "Invalidating page 0x%0lX page count %d pte_dirty %d \n",
           pageP,page_count(pageP),pte_dirty(pte)?1:0);
    
    if (flags == MmfInvalidate ||
        flags == MmfDiscard ||
        flags == MmfCacheInval)
    { 
#if defined(MMAP_LINUX_PATCH) && (LINUX_KERNEL_VERSION >= 2041803)
      page_remove_rmap(page,ptep);
#endif

      pte_clear(ptep);

#if LINUX_KERNEL_VERSION >= 2040200
      FLUSH_TLB_PAGE(vma, address);
#endif

      /* Since this page is still in the inode cache, the page fault handler
         finds it in the cache and reads from disk only if the page is not
         uptodate. Clear the page uptodate bit so that the next time it is
         referenced, the page fault handler reads it from disk.
      */
      if (Page_Uptodate(page) && write_access && (vma->vm_flags & VM_SHARED))
        ClearPageUptodate(page);
      
      /* The caller holds one reference and the mapping just removed held
         another, so anything below two means the accounting is broken. */
      if (page_count(page) < 2)
        BUG();

      /* Drop the reference that belonged to the cleared pte */
      page_cache_release(page);
    }
    if (pte_dirty(pte) && write_access && page->mapping->a_ops->writepage)
    {
       TRACE2(TRACE_VNODE, 2, TRCID_MMAP_DIRTY_PAGE,
              "invalidate_mmap_page: dirty page 0x%0lX page count %d\n",
              page, page_count(page));

       /* All page table work is done.  Release the spinlock before
          starting I/O: writepage and wait_on_page can block, which is not
          allowed while holding a spinlock. */
       spin_unlock(&vma->vm_mm->page_table_lock);

#if LINUX_KERNEL_VERSION >= 2040000
       error = page->mapping->a_ops->writepage(page);
       wait_on_page(page);
#else
       error = page->mapping->a_ops->writepage(vma->vm_file,page);
#endif
       /* The page lock is released by the writepage path, not here */
       return error;
    }
  }

  unlockpage:
     UnlockPage(pageP);
     spin_unlock(&vma->vm_mm->page_table_lock);
  return error;
}

/* Entry point handed to kernel_thread() by Createpagrkproc().
 *
 * Detaches the new thread from user resources, names it "mmkproc" and
 * then runs the pager main loop registered in gpfs_ops.
 *
 * iparms - opaque argument forwarded to gpfs_ops.pagerKprocMainThread
 *
 * Returns whatever gpfs_ops.pagerKprocMainThread returns.
 */
int
pagerKprocMainThreadP(void *iparms)
{
  int threadRc;

  /* Release all user resources, including the file table */
  daemonize();

  strcpy(current->comm, "mmkproc");

  threadRc = gpfs_ops.pagerKprocMainThread(iparms);
  return threadRc;
}

/* Append nextbufP at the tail of the circular, doubly-linked list whose
   sentinel is head (i.e. link it in just before head). */
static void AddToList(cxibuf_t *nextbufP, cxibuf_t * head)
{
  cxibuf_t *oldTail = head->av_back;

  oldTail->av_forw = nextbufP;
  nextbufP->av_back = oldTail;
  nextbufP->av_forw = head;
  head->av_back = nextbufP;
}

/* Take the first entry off the free list headed by NextFreePageEntry.
 *
 * Returns the unlinked entry, or NULL when the free list is empty (the
 * head points back at itself).  The unlinked entry's own av_forw/av_back
 * are left untouched.
 */
static cxibuf_t *getFreeEntry()
{
  cxibuf_t *firstFree = NextFreePageEntry.av_forw;

  if (firstFree == &NextFreePageEntry)
    return NULL;

  /* Unlink: make the neighbours point at each other */
  firstFree->av_back->av_forw = firstFree->av_forw;
  firstFree->av_forw->av_back = firstFree->av_back;

  return firstFree;
}

/* Sleep on FreeEventWord until woken (IoDone() wakes it after returning an
 * entry to the free list).  waitingForFreeEntries is true for the duration
 * of the sleep.
 *
 * NOTE(review): pagerLock is passed to e_sleep_thread together with
 * LOCK_HANDLER; presumably the lock is released while sleeping and
 * re-acquired before returning -- confirm against e_sleep_thread.
 */
static void waitForFreeEntry()
{
  TRACE0(TRACE_VNODE, 2, TRCID_MMAP_WAIT_ENTER,
         "waitForFreeEntry() enter\n");

  waitingForFreeEntries = true;
  e_sleep_thread(&FreeEventWord, &pagerLock, LOCK_HANDLER);
  waitingForFreeEntries = false;

  TRACE0(TRACE_VNODE, 2, TRCID_MMAP_WAIT_EXIT,
         "waitForFreeEntry() exit\n");
}

int RegisterPagerBackEnd()
{
  int i, rc = 0;

  TRACE0(TRACE_VNODE, 2, TRCID_MMAP_REG_ENTER,
         "RegisterPagerBackEnd enter\n");

  /* Initialize Next and last free page entries. At the begining all
     entries in Page_queue are available and the NextFreePageEntry
     points to the begining of the array. When a page arrives for
     read or write (by readpage or writepage functions), that page
     information will be copied to NextFreePageEntry and that entry
     will be added to the end of the PageRunQueue.
     The NextFreePageEntry will be moved to the next available entry.
   */

  NextFreePageEntry.av_forw = NextFreePageEntry.av_back = &NextFreePageEntry;
  PageRunQueue.av_forw = PageRunQueue.av_back = &PageRunQueue;

 /* Initialize page queue. Make all the Page_queue array entries as
    a double linked list and mark them as free entries by adding the
    list to NextFreePageEntry */

  for (i = 0; i<MAX_PAGEQUE_ENTRIES; i++)
  {
    AddToList(&Page_queue[i],&NextFreePageEntry);
    Page_queue[i].pageP = NULL;
    Page_queue[i].b_vp = NULL;
    Page_queue[i].vinfoP = NULL;
    Page_queue[i].b_baddr = NULL;
    Page_queue[i].b_flags = 0;
    Page_queue[i].b_blkno = 0;
  }

  TRACE1(TRACE_VNODE, 2, TRCID_MMAP_REG_EXIT,
         "RegisterPagerBackEnd exit: rc %d\n", rc);
  return rc;

}

/* Module termination: reset both queue heads (free list and run queue) to
   the empty state.  Always returns 0. */
int UnregisterPagerBackEnd()
{
  int rc = 0;

  TRACE0(TRACE_VNODE, 2, TRCID_MMAP_UNREG_ENTER,
         "UnregisterPagerBackEnd enter\n");

  NextFreePageEntry.av_back = &NextFreePageEntry;
  NextFreePageEntry.av_forw = &NextFreePageEntry;
  PageRunQueue.av_back = &PageRunQueue;
  PageRunQueue.av_forw = &PageRunQueue;

  TRACE1(TRACE_VNODE, 2, TRCID_MMAP_UNREG_EXIT,
         "UnregisterPagerBackEnd exit: rc %d\n", rc);
  return rc;
}

/* Start one pager kernel thread running pagerKprocMainThreadP(iparms).
 *
 * iparms - opaque argument handed to the new thread
 *
 * Returns the value returned by kernel_thread().
 */
gpfs_Kpid_t
Createpagrkproc(void *iparms)
{
  int rc = 0;      /* only ever reported in the trace record */
  gpfs_Kpid_t pid;

  /* Create the process */
  pid = kernel_thread(pagerKprocMainThreadP, iparms,
                      CLONE_FS | CLONE_FILES | SIGCHLD);

  TRACE2(TRACE_VNODE, 2, TRCID_MMAP_CREATE_EXIT,
         "Createpagrkproc exit: pid %d rc %d\n", pid, rc);
  return pid;
}

/* Return the byte offset in the file of the page attached to bufP:
   the page's index shifted left by PAGE_CACHE_SHIFT, computed in 64 bits. */
Int64 getFilePos(cxibuf_t *bufP)
{
  struct page *pageP = (struct page *)bufP->pageP;
  Int64 pos;

  pos = (Int64) pageP->index;
  pos <<= PAGE_CACHE_SHIFT;

  TRACE1(TRACE_VNODE, 2, TRCID_MMAP_FILEPOS_ENTER,
         "getFilePos: pos 0x%llX\n", pos);
  return pos;
}

/* Queue one page for reading or writing by a pager kproc.
 *
 * The request is copied into a free Page_queue entry, appended to
 * PageRunQueue, and a pager kproc is woken up.  If no entry is free, the
 * kprocs are woken and the caller sleeps until one is returned, then
 * starts over (including the check that the kprocs are still running).
 *
 * privVfsP  - stored in the queue entry
 * cnP       - node the page belongs to (stored as b_vp)
 * vinfoP    - open instance, may be NULL; when not NULL its rwPageDone
 *             flag is set
 * kaddr     - kernel address of the page data (stored as b_baddr)
 * page      - the struct page being transferred
 * write_req - non-zero for a write (B_WRITE), zero for a read (B_READ)
 *
 * Returns 0 if the request was queued, 1 if the kprocs are not running or
 * mmap has been aborted.
 */
int gpfsRWpage(struct gpfsVfsData_t *privVfsP, cxiNode_t *cnP,
               struct MMFSVInfo *vinfoP,
               caddr_t kaddr, void * page, int write_req)
{
  int oldIntLvl;
  cxibuf_t *nextbufP;
  int rc = 0;
  struct page *pageP = (struct page *)page;

  TRACE5(TRACE_VNODE, 2, TRCID_MMAP_RW_ENTER,
         "gpfsRWpage: %08x %08x %08x kaddr %08x req_type %s\n",
         pageP, PAGE_CACHE_SIZE, pageP->index, kaddr,
         write_req?"write":"read");

  /* Lock out pager kprocs while we modify the buf list */
  oldIntLvl = disable_lock(INTPAGER, &pagerLock);

  /* Remember that there were paging requests under the given instance */
  if (vinfoP != NULL)
    ((cxiVinfo_t *)vinfoP)->rwPageDone = true;

  for (;;)
  {
    /* The kprocs are not running or mapping has been aborted.  Either the
       daemon has not yet been started or it has been terminated.  Return
       the request with an error. */
    if (!kprocsRunning || mmapAbort)
    {
      unlock_enable(oldIntLvl, &pagerLock);
      rc = 1;
      break;
    }

    nextbufP = getFreeEntry();
    if (nextbufP == NULL)
    {
      /* No free entries. Wake up a kproc to process pages in the run
         queue, wait for a free entry, then re-check everything. */
      cxiWaitEventWakeup(&pagerEventWord);
      waitForFreeEntry();
      continue;
    }

    /* Everything is normal: fill in the entry and add it to the tail of
       the page run queue. */
    nextbufP->b_flags = write_req ? B_WRITE : B_READ;
    nextbufP->pageP = pageP;
    nextbufP->b_baddr = kaddr;
    nextbufP->b_bcount = PAGE_SIZE;
    nextbufP->b_vp = cnP;
    nextbufP->vinfoP = vinfoP;
    nextbufP->privVfsP = privVfsP;
    nextbufP->b_blkno = (PAGE_SIZE/CXI_DEV_BSIZE)*pageP->index;
    AddToList(nextbufP,&PageRunQueue);

    /* Wake up a pager kproc to process the request */
    cxiWaitEventWakeup(&pagerEventWord);
    unlock_enable(oldIntLvl, &pagerLock);
    break;
  }

  TRACE1(TRACE_VNODE, 2, TRCID_MMAP_RW_EXIT,
         "gpfsRWpage: returning with rc = %d\n", rc);
  return rc;
}

/* Complete a paging request.
 *
 * Updates the page's uptodate bit from the request flags (cleared on
 * B_ERROR, set after a successful B_READ, untouched otherwise), then
 * unmaps, unlocks and drops a reference on the page.  Finally the queue
 * entry is returned to the free list and anyone sleeping in
 * waitForFreeEntry() is woken.
 *
 * bufP - the queue entry that has been processed
 */
void
IoDone(cxibuf_t *bufP)
{
  int oldIntLvl;
  struct page *pageP = (struct page *)bufP->pageP;

  TRACE5(TRACE_VNODE, 2, TRCID_MMAP_IO_ENTER,
         "IoDone enter: pageP %08X, BufP->b_flags 0x%X count = %d "
         "page flags %08X index %d\n",
         pageP, bufP->b_flags, page_count(pageP), pageP->flags,pageP->index);

  if (bufP->b_flags & B_ERROR)
  {
    ClearPageUptodate(pageP);
  }
  else if (bufP->b_flags & B_READ)
  {
    SetPageUptodate(pageP);
  }

  kunmap(pageP);
  UnlockPage(pageP);
  put_page(pageP);

  TRACE3(TRACE_VNODE, 2, TRCID_MMAP_IO_EXIT,
         "IoDone exit: pageP %08X, count = %d page flags %08X\n",
         pageP, page_count(pageP), pageP->flags);

  /* Give the entry back and wake up threads waiting for free entries */
  oldIntLvl = disable_lock(INTPAGER, &pagerLock);
  AddToList(bufP, &NextFreePageEntry);
  cxiWaitEventWakeup(&FreeEventWord);
  unlock_enable(oldIntLvl, &pagerLock);
}


/* Wake up the kproc master thread by sending SIGUSR1 to process `pid`. */
void wakeKprocMaster(cxiPid_t pid)
{
  (void) kill_proc(pid, SIGUSR1, 1);
}

/* Return the address of the run-queue list head (PageRunQueue). */
cxibuf_t *
getHeadQueue()
{
  cxibuf_t *runQueueHead = &PageRunQueue;

  return runQueueHead;
}

/* Look up the OS-level objects behind a cxiNode_t.
 *
 * gnP      - the node, passed as an untyped pointer
 * vP       - out: the node's osNodeP (used in this file as a struct inode *)
 * privVfsP - out: VP_TO_PVP() of that inode
 */
void
getVp(void *gnP, void **vP, struct gpfsVfsData_t  **privVfsP)
{
  cxiNode_t *nodeP = (cxiNode_t *)gnP;
  struct inode *inodeP = (struct inode *)nodeP->osNodeP;

  *privVfsP = VP_TO_PVP(inodeP);
  *vP = nodeP->osNodeP;
}

/* Walk the user address range [address, end) of one vma page by page and
 * hand every present, unlocked page that belongs to the vma's file to
 * invalidate_mmap_page().
 *
 * gnP     - node being flushed (not used here)
 * vma     - mapping to walk; nothing is done if it has no file
 * address - first user virtual address to examine
 * end     - first address past the range; must be greater than address
 * mmfopt  - flush option forwarded to invalidate_mmap_page()
 *
 * A reference on vma->vm_file is held for the duration of the walk.
 * page_table_lock is held while page tables are examined and dropped
 * around lock_page()/invalidate_mmap_page(), which may block.
 *
 * NOTE(review): the pte pointer is looked up under page_table_lock but is
 * passed to invalidate_mmap_page() after the lock has been dropped.
 */
void flush_vma(cxiNode_t *gnP, struct vm_area_struct *vma,
               unsigned long address, unsigned long end,
               enum MmflushOption mmfopt)
{
  pgd_t *dir;
  pmd_t *pmd;
  pte_t *pte;
  pte_t entry;
  int error = 0;
  struct page *page;
  size_t size = end - address;
  unsigned long  index;
  struct address_space *mapping;
  Boolean need_unlock = false;

  TRACE3(TRACE_VNODE, 2, TRCID_FLUSH_VMA_ENTER,
         "flush_vma: vma %lX address %lX size %d\n", vma, address, size);

  /* Check for a file before following any pointer through it */
  if (vma->vm_file == NULL)
    return;

  mapping = vma->vm_file->f_dentry->d_inode->i_mapping;

  get_file(vma->vm_file);

  if (address >= end)
  {
    TRACE0(TRACE_VNODE, 2, TRCID_FLUSH_VMA_BAD_ADDR,
           "flush_vma: bad address\n");
    error = 1;
    goto exit;
  }
  
  spin_lock(&vma->vm_mm->page_table_lock); 
  TRACE1(TRACE_VNODE, 6, TRCID_FLUSH_VMA_PGT_LOCK,
           "flush_vma: got vma %08X page_table_lock\n",vma);
  need_unlock = true;

  do
  {
    index = ((address - vma->vm_start) >> PAGE_CACHE_SHIFT) + vma->vm_pgoff;
    TRACE4(TRACE_VNODE, 6, TRCID_PAGE_LOCK,
           "flush_vma: getting locked page. vma 0x%lX address 0x%lX end 0x%lX index %d\n",
            vma,address,end,index);

    /* Find the page directory entry for THIS address.  It has to be looked
       up again for every page: a single pgd entry covers only part of the
       address space, so reusing the entry of the start address would walk
       the wrong page tables once the range crosses a pgd boundary. */
    dir = pgd_offset(vma->vm_mm, address);
    if (pgd_none(*dir))
    {
       TRACE0(TRACE_VNODE, 6, TRCID_FLUSH_VMA_BAD_PGD, 
           "flush_vma: pgd none\n");
       goto next_page;
    }

    /* Make sure that the pmd is valid before getting the page table entry */
    pmd = pmd_offset(dir,address);
    if (pmd_none(*pmd))
    {
       TRACE0(TRACE_VNODE, 6, TRCID_FLUSH_VMA_BAD_PMD, 
               "flush_vma: pmd none\n");
       goto next_page;
    }
    pte = pte_offset(pmd,address);
    if (!pte_present(*pte))
    {
        TRACE0(TRACE_VNODE, 6, TRCID_FLUSH_VMA_BAD_PTE, 
               "flush_vma: pte not present\n");
        goto next_page;
    }
    if (pte_none(*pte))
    {
        TRACE0(TRACE_VNODE, 6, TRCID_FLUSH_VMA_NONE_PTE,
            "flush_vma: pte none\n");
        goto next_page;
    }

    entry = *pte;
    page = pte_page(entry);
    
    /* make sure this page belongs to this mapping. */
    if (page == NULL)
      goto next_page; 

    if ( (page->mapping != mapping) || (page->index != index) )
      goto next_page;

    /* We do not invalidate locked pages */
    if (!PageLocked(page))
    {
      /* Pin the page across the window in which no spinlock is held */
      page_cache_get(page);
      TRACE1(TRACE_VNODE, 6, TRCID_FLUSH_VMA_PAGE_LOCK,
          "flush_vma: getting page %08X lock\n",page);

      /* do not hold spin lock while getting blocking lock */
      spin_unlock(&vma->vm_mm->page_table_lock);
      lock_page(page);
      error |= invalidate_mmap_page(pte, vma, page,address,mmfopt);
#if LINUX_KERNEL_VERSION >= 2041000
      page_cache_release(page);
#else
      page_cache_free(page);
#endif
      spin_lock(&vma->vm_mm->page_table_lock); 
    }
    next_page:
    address += PAGE_SIZE;

    /* `address &&` stops the walk if the address wraps around to zero */
  } while (address && (address < end));

exit:
  if (need_unlock)
  { 
    spin_unlock(&vma->vm_mm->page_table_lock);
    TRACE1(TRACE_VNODE, 6, TRCID_FLUSH_VMA_PGT_UNLOCK,
           "flush_vma: releasing vma %08X page_table_lock\n",vma);
  }
  fput(vma->vm_file);
  TRACE1(TRACE_VNODE, 2, TRCID_FLUSH_VMA_EXIT,
         "flush_vma:error %d\n", error);
}

/* Flush the file byte range [start, end) in every vma on a share list.
 *
 * gnP    - node being flushed; its osNodeP is used as a struct inode *
 * vma    - first vma of the list (linked through vm_next_share); may be
 *          NULL, in which case nothing is done
 * start  - first file offset (bytes) of the range
 * end    - file offset (bytes) just past the range
 * mmfopt - flush option forwarded to flush_vma()
 *
 * For each vma the file range is intersected with the part of the file
 * the vma maps, [vm_pgoff << PAGE_SHIFT, that + (vm_end - vm_start)), and
 * the intersection is translated to user addresses for flush_vma().
 * Vmas that map a different inode, that do not overlap the range, or that
 * have both VM_DENYWRITE and VM_WRITE set are skipped.
 */
void flushVma(cxiNode_t *gnP, struct vm_area_struct *vma,
               UInt64 start, UInt64 end,
               enum MmflushOption mmfopt)
{
  unsigned long start_address, end_address;
  UInt64 map_start, map_end;   /* file offsets covered by this vma */

  struct inode *inodeP = (struct inode *) gnP->osNodeP;
  if (vma == NULL)
    return;

  do
  {
    TRACE2(TRACE_VNODE, 2, TRCID_MMAP_FLUSH_1,
           "flushVma: vma = %08X, i_no = %d\n",
           vma, vma->vm_file->f_dentry->d_inode->i_ino);
    TRACE2(TRACE_VNODE, 2, TRCID_MMAP_FLUSH_2,
           "flushVma: vma_start = %08X, vma_end = %08X\n",
           vma->vm_start, vma->vm_end);

    /* It should never happen */
    if (inodeP->i_ino != vma->vm_file->f_dentry->d_inode->i_ino)
      continue;

    /* No need to flush if the range is outside the part of the file that
       this vma maps.  The vma maps file offsets [map_start, map_end); the
       shift is done in 64 bits so large page offsets do not overflow. */
    map_start = ((UInt64) vma->vm_pgoff) << PAGE_SHIFT;
    map_end = map_start + (vma->vm_end - vma->vm_start);
    if (start >= map_end || end <= map_start)
      continue;

    /* Translate file offsets to user addresses: an offset `off` inside the
       vma lives at vm_start + (off - map_start).  Clamp to the vma. */
    if (start <= map_start)
      start_address = vma->vm_start;
    else
      start_address = vma->vm_start + (unsigned long) (start - map_start);

    if (end >= map_end)
      end_address = vma->vm_end;
    else
      end_address = vma->vm_start + (unsigned long) (end - map_start);

    TRACE2(TRACE_VNODE, 2, TRCID_MMAP_FLUSH_3,
           "flushVma: start_address %lX end_address %lX\n",
           start_address, end_address);

    /* flush this vma */
    if ((vma->vm_flags & VM_DENYWRITE) && (vma->vm_flags & VM_WRITE))
    {
      TRACE1(TRACE_VNODE, 3, TRCID_MMAP_FLUSH_4,
             "do not flush vma 0x%lX\n",vma);
      continue;
    }

    flush_vma(gnP, vma, start_address, end_address, mmfopt);

    /* `continue` above lands here too: a do-while re-evaluates its
       condition on continue, so the list is always advanced. */
  } while ((vma = vma->vm_next_share) != NULL);
}

/* Flush and/or invalidate the memory-mapped pages of a file.
 *
 * gnPP   - the node (cxiNode_t *), passed as an untyped pointer; must not
 *          be NULL
 * start  - requested start offset; only traced, see below
 * end    - requested end offset; only traced, see below
 * mmfopt - flush option
 *
 * Flush Options:
 *     MmfKeep       - Leave pages as they are
 *     MmfProtect    - Change page protection to read-only
 *     MmfInvalidate - Invalidate the pages
 *     MmfTerminate  - Invalidate the pages and wait for I/O to complete
 *     MmfDiscard    - Invalidate pages without flushing them
 *
 * The requested range is ignored: the whole file (0 .. i_size) is always
 * processed, for the reason given in the comment in the body.  Every vma
 * that maps the inode is walked under the mmap flush lock; then, unless
 * mmfopt is MmfKeep, dirty pages are written and the inode's page cache
 * pages are invalidated.
 *
 * Always returns 0.
 *
 * NOTE(review): i_shared_lock is a spinlock and is held across flushVma(),
 * which can reach lock_page() in flush_vma().
 */
int
MMFlushRange(void *gnPP, UInt64 start, UInt64 end, enum MmflushOption mmfopt)
{
  int rc = 0;
  struct vm_area_struct *vma, *shared_vma = NULL;
  cxiNode_t *gnP = (cxiNode_t *)gnPP;
  struct inode *inodeP;
  struct address_space *mapping;
  char buf[] = "MMFlushRange()";

  DBGASSERT(gnP != NULL);

  inodeP = (struct inode *) gnP->osNodeP;
  mapping = inodeP->i_mapping;

  TRACE4(TRACE_VNODE, 2, TRCID_MMAP_FLUSH_ENTER,
         "MMFlushRange: gnP %08X inode number = %d, mmopt %d nrpages %d\n",
         gnP, inodeP->i_ino, mmfopt,inodeP->i_data.nrpages);
  TRACE2(TRACE_VNODE, 2, TRCID_MMAP_FLUSH_ENTER_A,
         "MMFlushRange: start 0x%lX end 0x%lX\n",
         (start &PAGE_MASK),(end & PAGE_MASK));

  /* Since there is no good interface to remove a range/single page(s) 
     from page table caches ( even the locks that are needed to remove 
     pages from page table caches are not exported), at least for this 
     release I am forced to use the only available interface 
     invalidate_inode_pages() which removes mmapped pages of an inode
     that are not locked from page table caches.  I know it is a 
     performance issue, but I have no choice.  For this reason I have
     to check/invalidate pte entries of all mmaped pages of this inode
     that are not locked. 
  */
  start = 0;
  end = inodeP->i_size;
  mmapFlushLock(gnP,buf);
  spin_lock(&mapping->i_shared_lock);

  /* Private mappings first, then (2.4 and later) the shared ones */
  vma = mapping->i_mmap;
  flushVma(gnP,vma,start,end,mmfopt);
# if LINUX_KERNEL_VERSION >= 2040000
  shared_vma = mapping->i_mmap_shared;
  flushVma(gnP,shared_vma,start,end,mmfopt);
#endif
  spin_unlock(&mapping->i_shared_lock);
  mmapFlushUnLock(gnP,buf);
  
  if (mmfopt != MmfKeep)
  {
    TRACE2(TRACE_VNODE, 2, TRCID_MMAP_FLUSH_INVAL1,
           "Invalidating inode pages of inode %d nrpages %d\n",
            inodeP->i_ino,inodeP->i_data.nrpages);
    /* First flush all dirty pages if any */
#if LINUX_KERNEL_VERSION < 2041800
    flush_dirty_pages(inodeP);
#else
    /* flush all dirty mmaped pages */
    filemap_fdatasync(inodeP->i_mapping);
    /* wait until all IO's are finished */
    filemap_fdatawait(inodeP->i_mapping);
#endif
    invalidate_inode_pages(inodeP);
    TRACE2(TRACE_VNODE, 2, TRCID_MMAP_FLUSH_INVAL2,
           "Invalidated inode pages of inode %d nrpages %d\n",
            inodeP->i_ino,inodeP->i_data.nrpages);
  }

  TRACE1(TRACE_VNODE, 2, TRCID_MMAP_FLUSH_EXIT,
         "MMFlushRange exit: rc %d\n", rc);
  return rc;
}
